Load preprocessed dataset (preprocessing code in data_preprocessing.Rmd)
# Report the dataset dimensions and the ASD / control split of the samples
glue('Number of genes: {nrow(datExpr)}\n',
     'Number of samples: {ncol(datExpr)} ',
     '({sum(datMeta$Diagnosis_ == "ASD")} ASD, ',
     '{sum(datMeta$Diagnosis_ != "ASD")} controls)')
## Number of genes: 30112
## Number of samples: 80 (45 ASD, 35 controls)
Since there are many more genes than samples, we can perform PCA and reduce the dimension from ~30K gene features to at most 80 (the number of samples) without losing any information, because the samples span a subspace of at most that dimension
# PCA on the samples (t() puts samples in rows); keep all principal components
# as the reduced-dimension representation used by every clustering method below
pca = prcomp(t(datExpr))
datExpr_redDim = as.data.frame(pca$x)
clusterings = list()
The elbow plot shows no clearly recognisable best k, so k=4 was chosen
# Elbow method: total within-cluster sum of squares for k = 1..15
set.seed(123)
wss = vapply(1:15,
             function(k) kmeans(datExpr_redDim, k, nstart=25)$tot.withinss,
             numeric(1))
plot(wss, type='b', main='K-Means Clustering')
best_k = 4
abline(v = best_k, col='blue')
# Final K-means fit with the chosen k; keep the labels for later comparison
datExpr_k_means = kmeans(datExpr_redDim, best_k, nstart=25)
clusterings[['km']] = datExpr_k_means$cluster
Chose k=9 as best number of clusters.
# Hierarchical clustering of samples in PCA space (euclidean distance,
# hclust's default complete linkage), kept as a dendrogram for plotting
h_clusts = as.dendrogram(hclust(dist(datExpr_redDim)))
plot(h_clusts)
abline(h=126, col='blue')  # cutting at this height yields 9 clusters
best_k = 9
Clusters seem to be able to separate ASD and control samples pretty well and there are no noticeable patterns regarding sex, age or brain region in any cluster.
Younger ASD samples seem to be more similar to control samples than older ASD ones. Perhaps there’s a relation between age and severity of ASD: diagnoses are more inclusive nowadays, identifying mild autism cases that weren’t recognised before, and most of these new diagnoses are made in children.
Colors:
Diagnosis: Blue=control, Green=ASD
Sex: Pink=Female, Blue=Male
Brain region: Pink=Frontal, Green=Temporal, Blue=Parietal, Purple=Occipital
Age: Purple=youngest, Yellow=oldest
clusterings[['hc']] = cutree(h_clusts, best_k)
# Build a named vector mapping each age to a viridis colour so samples can be
# coloured on a continuous purple (young) to yellow (old) scale.
# Generalized to accept any numeric age vector (default: the metadata ages),
# instead of reading the global datMeta directly.
# NOTE(review): assumes ages are whole numbers -- names come from
# seq(min, max) and are looked up via as.character(Age); confirm datMeta$Age
# contains no fractional values
create_viridis_dict = function(ages = datMeta$Age){
  min_age = min(ages)
  max_age = max(ages)
  # one colour per year in the observed range
  viridis_age_cols = viridis(max_age - min_age + 1)
  names(viridis_age_cols) = seq(min_age, max_age)
  return(viridis_age_cols)
}
viridis_age_cols = create_viridis_dict()
# Colour-bar annotation for the dendrogram: datMeta rows reordered to match
# the left-to-right order of the dendrogram leaves, with one hex-colour
# column per covariate (Age, Region, Sex, Diagnosis)
dend_meta = datMeta[match(labels(h_clusts), rownames(datMeta)),] %>%
mutate('Diagnosis' = ifelse(Diagnosis_=='CTL','#008080','#86b300'), # Blue control, Green ASD
'Sex' = ifelse(Sex=='F','#ff6666','#008ae6'), # Pink Female, Blue Male
'Region' = case_when(Brain_lobe=='Frontal'~'#F8766D', # ggplot defaults for 4 colours
Brain_lobe=='Temporal'~'#7CAE00',
Brain_lobe=='Parietal'~'#00BFC4',
Brain_lobe=='Occipital'~'#C77CFF'),
'Age' = viridis_age_cols[as.character(Age)]) %>% # Purple: young, Yellow: old
dplyr::select(Age, Region, Sex, Diagnosis)
# Draw the dendrogram with blank leaf labels and branches coloured by the
# k = 9 cluster assignment, then add the covariate colour bars underneath
h_clusts %>% set('labels', rep('', nrow(datMeta))) %>% set('branches_k_color', k=best_k) %>% plot
colored_bars(colors=dend_meta)
Chose the best clustering to be with k=6
*The rest of the output plots can be found in the Data/Gandal/consensusClusterng/samples folder
Following this paper’s guidelines:
Run PCA and keep enough components to explain 60% of the variance (The 9 first components explain 60% of the variance, but decided to keep the first 26 to accumulate 80% of the variance)
Run ICA with that same number of nbComp as principal components kept to then filter them
Select components with kurtosis > 3
Assign obs to genes with FDR<0.01 using the fdrtool package
Not a good method for clustering samples because:
ICA does not perform well with small samples (see Figure 4 of this paper)
Still, it leaves only 14 samples without a cluster
ICA_clusters %>% rowSums %>% table
## .
## 0 1 2 3
## 14 45 13 8
# Tile plot of the sample-to-ICA-cluster membership matrix, plus an extra
# column (cl_sum) with the total number of clusters each sample belongs to;
# coord_flip puts clusters on the vertical axis
ICA_clusters %>% mutate(cl_sum=rowSums(.)) %>% as.matrix %>% melt %>% ggplot(aes(Var2,Var1)) +
geom_tile(aes(fill=value)) + xlab('Clusters') + ylab('Samples') +
theme_minimal() + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + coord_flip()
This method doesn’t work:
Using the PCA-reduced expression dataset gives a very erratic \(R^2\). Choosing the best power (7) as well as the second best (5) ends up leaving all genes without a cluster
best_power = datExpr_redDim %>% t %>% pickSoftThreshold(powerVector = 1:20)
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.00285 0.0763 0.3560 14.30000 1.36e+01 21.6000
## 2 2 0.32900 -0.5700 0.4640 3.94000 3.54e+00 8.0000
## 3 3 0.40400 -0.7830 0.3660 1.37000 1.18e+00 3.4900
## 4 4 0.78500 -0.7560 0.8640 0.55600 4.36e-01 1.6400
## 5 5 0.79500 -0.8980 0.7360 0.25800 1.71e-01 0.8760
## 6 6 0.20800 -2.8400 0.0684 0.13300 6.63e-02 0.5860
## 7 7 0.85500 -1.1400 0.8140 0.07490 2.85e-02 0.4150
## 8 8 0.15000 -2.1200 -0.0894 0.04560 1.33e-02 0.3050
## 9 9 0.06070 -1.6300 -0.0457 0.02940 6.28e-03 0.2310
## 10 10 0.08230 -1.3300 -0.0747 0.01990 2.87e-03 0.1780
## 11 11 0.02750 -0.9650 0.0595 0.01400 1.44e-03 0.1390
## 12 12 0.11900 -2.1600 -0.0125 0.01010 7.67e-04 0.1100
## 13 13 0.07210 -1.5500 0.0236 0.00751 3.98e-04 0.0875
## 14 14 0.05230 -1.4800 0.1530 0.00565 1.99e-04 0.0701
## 15 15 0.11700 -2.2900 0.0592 0.00431 9.48e-05 0.0564
## 16 16 0.12900 -2.3300 0.0131 0.00333 4.56e-05 0.0455
## 17 17 0.16500 -2.5000 0.0236 0.00259 2.28e-05 0.0369
## 18 18 0.08700 -1.8900 0.1140 0.00203 1.15e-05 0.0299
## 19 19 0.09750 -1.9300 0.1030 0.00160 5.83e-06 0.0243
## 20 20 0.15200 -2.3200 0.0343 0.00126 3.04e-06 0.0198
Running WGCNA with power=7
# Best power
# WGCNA module detection on the PCA-reduced data using the automatically
# estimated soft-threshold power; numericLabels=TRUE returns module numbers
# (0 = unassigned) instead of colour names
network = datExpr_redDim %>% t %>% blockwiseModules(power=best_power$powerEstimate, numericLabels=TRUE)
## Warning in blockwiseModules(., power = best_power$powerEstimate, numericLabels = TRUE): blockwiseModules: mergeCloseModules failed with the following error message:
## Error in mergeCloseModules(datExpr, colors[gsg$goodGenes], cutHeight = mergeCutHeight, :
## Error in moduleEigengenes(expr = exprData[[set]]$data, colors = setColors, :
## Color levels are empty. Possible reason: the only color is grey and grey module is excluded from the calculation.
##
##
## --> returning unmerged colors.
Cluster distribution
table(network$colors)
##
## 0
## 80
Running WGCNA with power=5
# 2nd best
# Retry module detection with the second-best power (5) from the scale-free
# fit table above; same numericLabels convention (0 = unassigned)
network = datExpr_redDim %>% t %>% blockwiseModules(power=5, numericLabels=TRUE)
## Warning in blockwiseModules(., power = 5, numericLabels = TRUE): blockwiseModules: mergeCloseModules failed with the following error message:
## Error in mergeCloseModules(datExpr, colors[gsg$goodGenes], cutHeight = mergeCutHeight, :
## Error in moduleEigengenes(expr = exprData[[set]]$data, colors = setColors, :
## Color levels are empty. Possible reason: the only color is grey and grey module is excluded from the calculation.
##
##
## --> returning unmerged colors.
Cluster distribution
table(network$colors)
##
## 0
## 80
Using the original expression dataset, the \(R^2\) threshold is never achieved, although it gets closest at power 11, but WGCNA still doesn’t manage to find any clusters within the data
best_power = datExpr %>% pickSoftThreshold(powerVector = 1:30)
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.289 247.00 0.1000 76.5 76.7 77.2
## 2 2 0.286 123.00 0.0953 74.1 74.5 75.5
## 3 3 0.282 81.00 0.0898 71.7 72.4 73.8
## 4 4 0.286 61.10 0.0901 69.5 70.3 72.2
## 5 5 0.283 48.60 0.0858 67.3 68.3 70.6
## 6 6 0.280 40.40 0.0812 65.2 66.4 69.0
## 7 7 0.279 34.30 0.0784 63.2 64.5 67.5
## 8 8 0.276 29.80 0.0746 61.3 62.8 66.0
## 9 9 0.273 26.40 0.0701 59.4 61.0 64.6
## 10 10 0.275 24.00 0.0714 57.6 59.4 63.1
## 11 11 0.786 6.89 0.8230 55.9 57.8 61.8
## 12 12 0.759 6.35 0.7990 54.2 56.2 60.4
## 13 13 0.786 5.84 0.8200 52.6 54.7 59.1
## 14 14 0.780 5.40 0.8070 51.1 53.2 57.8
## 15 15 0.729 4.88 0.7500 49.6 51.8 56.6
## 16 16 0.730 4.58 0.7510 48.1 50.5 55.3
## 17 17 0.694 4.35 0.7210 46.7 49.1 54.1
## 18 18 0.695 4.11 0.7210 45.4 47.8 53.0
## 19 19 0.697 3.87 0.7170 44.1 46.6 51.8
## 20 20 0.707 3.68 0.7250 42.8 45.3 50.7
## 21 21 0.729 3.31 0.7880 41.6 44.1 49.6
## 22 22 0.724 3.13 0.7770 40.5 43.0 48.6
## 23 23 0.612 2.94 0.6550 39.3 41.9 47.6
## 24 24 0.626 2.74 0.6780 38.2 40.8 46.5
## 25 25 0.610 2.59 0.6560 37.2 39.7 45.6
## 26 26 0.612 2.50 0.6570 36.1 38.7 44.6
## 27 27 0.614 2.41 0.6570 35.1 37.7 43.6
## 28 28 0.614 2.33 0.6570 34.2 36.7 42.7
## 29 29 0.708 2.15 0.7730 33.3 35.8 41.8
## 30 30 0.756 2.03 0.8180 32.4 34.8 40.9
Running WGCNA with power=11
# Best power
# WGCNA on the original expression matrix with the closest-to-threshold
# power (11); numericLabels=TRUE returns module numbers
network = datExpr %>% blockwiseModules(power=11, numericLabels=TRUE)
## mergeCloseModules: less than two proper modules.
## ..color levels are 1
## ..there is nothing to merge.
Cluster distribution
table(network$colors)
##
## 1
## 80
Points don’t seem to follow a Gaussian distribution no matter the number of clusters, so 4 groups were chosen following the best k from K-means, since the two methods are similar
# Model selection: BIC for Gaussian mixtures with 1..50 components
n_clust = Optimal_Clusters_GMM(datExpr_redDim, max_clusters=50, criterion='BIC', plot_data=FALSE)
plot(n_clust, type='l', main='Bayesian Information Criterion to choose number of clusters')
best_k = 4 # copying k-means best_k
# Fit the mixture and label each sample with its highest-likelihood component
gmm = GMM(datExpr_redDim, best_k)
clusterings[['GMM']] = apply(gmm$Log_likelihood, 1, which.max)
Plot of clusters with their centroids in gray
# Append the mixture centroids (labelled NA, drawn grey) to the samples so
# both appear in the same interactive PC1/PC2 scatter
gmm_points = rbind(datExpr_redDim,
                   setNames(data.frame(gmm$centroids), names(datExpr_redDim)))
gmm_labels = as.factor(c(clusterings[['GMM']], rep(NA, best_k)))
ggplotly(ggplot(gmm_points, aes(x=PC1, y=PC2, color=gmm_labels)) + geom_point() + theme_minimal())
# Clean up chunk-level intermediates; several names (cc_output, ICA_output,
# ICA_clusters_names, signals_w_kurtosis) come from chunks not shown here.
# NOTE(review): 'c' is presumably a variable created in a hidden chunk that
# shadows base::c (rm cannot remove base functions) -- confirm it exists
rm(wss, datExpr_k_means, h_clusts, cc_output, best_k, ICA_output, ICA_clusters_names,
signals_w_kurtosis, n_clust, gmm, gmm_points, gmm_labels, network, dend_meta,
best_power, c, viridis_age_cols, create_viridis_dict)
Using Adjusted Rand Index:
K-means, Hierarchical Clustering, Consensus Clustering and Gaussian Mixtures seem to give relatively similar clusterings
K-means and Gaussian Mixtures are the most similar clusterings, which makes sense because the two methods are similar
ASD seems to be captured best by Gaussian Mixtures (although it’s not that good)
ICA seems to be the best clustering to capture Age (although it’s not that good)
No clusterings were able to capture any type of Sex or Brain Region structure
# Compare every clustering against the others and against the phenotype
# variables using the Adjusted Rand Index. Only the upper triangle is filled:
# ARI is symmetric and the diagonal is trivially the self-similarity.
clusters_plus_phenotype = clusterings
clusters_plus_phenotype[['Subject']] = datMeta$Subject_ID
clusters_plus_phenotype[['ASD']] = datMeta$Diagnosis_
clusters_plus_phenotype[['Region']] = datMeta$Brain_lobe
clusters_plus_phenotype[['Sex']] = datMeta$Sex
clusters_plus_phenotype[['Age']] = datMeta$Age
# Hoist the list length and use seq_len instead of the 1:length() pattern
n_vars = length(clusters_plus_phenotype)
cluster_sim = data.frame(matrix(nrow = n_vars, ncol = n_vars))
for(i in seq_len(n_vars)){
  cluster1 = as.factor(clusters_plus_phenotype[[i]])
  for(j in i:n_vars){
    cluster2 = as.factor(clusters_plus_phenotype[[j]])
    cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
  }
}
colnames(cluster_sim) = names(clusters_plus_phenotype)
rownames(cluster_sim) = colnames(cluster_sim)
cluster_sim = cluster_sim %>% as.matrix %>% round(2)
# Annotated heatmap of the pairwise ARI values (no dendrograms, values
# printed inside the cells)
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none',
cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE,
cexRow = 1, cexCol = 1, margins = c(7,7))
rm(i, j, n_vars, cluster1, cluster2, clusters_plus_phenotype, cluster_sim)
# Gather the first three PCs, all clustering labels (including 'cc' and
# 'ICA_min', which were computed in chunks not shown here) and the phenotype
# covariates into one data frame for the interactive scatter plot
plot_points = datExpr_redDim %>% data.frame() %>% dplyr::select(PC1:PC3) %>%
mutate(ID = rownames(.), Subject_ID = datMeta$Subject_ID,
KMeans = as.factor(clusterings[['km']]), Hierarchical = as.factor(clusterings[['hc']]),
Consensus = as.factor(clusterings[['cc']]), ICA = as.factor(clusterings[['ICA_min']]),
GMM = as.factor(clusterings[['GMM']]),
Sex = as.factor(datMeta$Sex), Region = as.factor(datMeta$Brain_lobe),
Diagnosis = as.factor(datMeta$Diagnosis_), Age = datMeta$Age)
# selectable_scatter_plot is a project-defined helper (not in this file)
selectable_scatter_plot(plot_points, plot_points)